/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"
.text
.align 5

//void MatmulInt8DpNeon64(const int8_t *a, const int8_t *b, int8_t *dst, int row8, int col8, int deep4, 
//                      const int *a_sums, const int *bias, int act_min, int act_max, int out_zp,
//                      const int *multiplier, const int *left_shift, const int *right_shift, int row,
//                      int col, int stride, int peroc);

// x0: a(left matrix ptr)
// x1: b(right matrix ptr)
// x2: out ptr
// w3: row8
// w4: col8
// w5: deep4
// x6: a_sums
// x7: bias
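// The remaining arguments are passed on the stack (8-byte slots) and are
// loaded into the following registers by the prologue:
// w8: act_min
// w9: act_max
// w10: out_zp
// x11: multiplier
// x12: left_shift
// x13: right_shift
// w14: row
// w15: col
// w24: stride
// w27: filter_peroc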

// MatmulInt8DpNeon64: int8 matrix multiply using the SDOT dot-product
// instruction, producing requantized int8 output one 8(row) x 8(col) tile at a
// time.
//
// Loop structure:
//   L1 : iterates over 8-column blocks of b (counter w4, step 8)
//   L2 : iterates over 8-row blocks of a (counter w16, step 8)
//   L3 : accumulates over depth, 16 at a time (LoopD16), then 4 at a time
//        (LoopD4)
// Accumulators: v16..v31. The even register of each pair holds columns 0-3 and
// the odd register holds columns 4-7; the pair (v16, v17) is tile row 0,
// (v18, v19) is tile row 1, ... (v30, v31) is tile row 7.
//
// Stack frame: 208 bytes = 128 (v8-v15) + 80 (x19-x28). sp stays at the bottom
// of this save area for the whole function, so the saved registers are never
// located below sp (AAPCS64 has no red zone; memory below sp may be
// overwritten asynchronously, e.g. by a signal handler).
asm_function MatmulInt8DpNeon64
  sub sp, sp, #208
  mov x16, sp       // x16: cursor into the save area (sp itself is left untouched)
  st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x16], #64
  st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x16], #64
  stp x19, x20, [x16], #16
  stp x21, x22, [x16], #16
  stp x23, x24, [x16], #16
  stp x25, x26, [x16], #16
  stp x27, x28, [x16], #16

  // x16 has advanced by 208 bytes and now equals the sp value at function
  // entry, i.e. the base of the stack-passed arguments (one 8-byte slot each).
  ldr w8, [x16]         // act_min
  ldr w9, [x16, #8]     // act_max
  ldr w10, [x16, #16]   // out_zp
  ldr x11, [x16, #24]   // multiplier
  ldr x12, [x16, #32]   // left_shift
  ldr x13, [x16, #40]   // right_shift
  ldr w14, [x16, #48]   // row
  ldr w15, [x16, #56]   // col
  ldr w24, [x16, #64]   // stride (used as the post-index step between output rows)
  ldr w27, [x16, #72]   // filter_peroc (0: per-tensor path, non-zero: per-channel path)

  mov w17, #8       // sizeof(int8)*8
  mul w21, w5, w17  // the stride of a/b: sizeof(int8)*8*deep4
  mov x25, x2       // x25: output base of the current 8-column block
L1:
  cmp w4, #0      // if at the end of col8
  beq End1

  mov w16, w3     // reset a row8 counter
  mov w23, w14    // reset a row counter
  mov x17, x0     // reload a ptr
  mov x22, x6     // reload a_sums ptr (only consumed on the per-tensor path)
L2:
  cmp w16, #0
  beq End2

  mov x28, x1     // reload b ptr
  mov x19, x7     // reload bias ptr
  mov w20, w5     // reload depth
  // Clear the 8x8 int32 accumulator tile.
  dup v16.4s, wzr
  dup v17.4s, wzr
  dup v18.4s, wzr
  dup v19.4s, wzr
  dup v20.4s, wzr
  dup v21.4s, wzr
  dup v22.4s, wzr
  dup v23.4s, wzr
  dup v24.4s, wzr
  dup v25.4s, wzr
  dup v26.4s, wzr
  dup v27.4s, wzr
  dup v28.4s, wzr
  dup v29.4s, wzr
  dup v30.4s, wzr
  dup v31.4s, wzr
L3:
  cmp w20, #16
  blt LoopD4

  // Depth step of 16: four groups, each loading 32 bytes of a (x17) and
  // 32 bytes of b (x28) and issuing 16 SDOTs. Loads for the next group are
  // interleaved with the SDOTs of the current one.
LoopD16:
  ld1 {v0.16b, v1.16b}, [x17], #32
  ld1 {v2.16b, v3.16b}, [x28], #32

  sdot v16.4s, v2.16b, v0.4b[0]
  sdot v18.4s, v2.16b, v0.4b[1]
  sdot v20.4s, v2.16b, v0.4b[2]
  sdot v22.4s, v2.16b, v0.4b[3]

  ld1 {v4.16b, v5.16b}, [x17], #32
  sdot v24.4s, v2.16b, v1.4b[0]
  sdot v26.4s, v2.16b, v1.4b[1]
  sdot v28.4s, v2.16b, v1.4b[2]
  sdot v30.4s, v2.16b, v1.4b[3]

  ld1 {v6.16b, v7.16b}, [x28], #32
  sdot v17.4s, v3.16b, v0.4b[0]
  sdot v19.4s, v3.16b, v0.4b[1]
  sdot v21.4s, v3.16b, v0.4b[2]
  sdot v23.4s, v3.16b, v0.4b[3]

  sdot v25.4s, v3.16b, v1.4b[0]
  sdot v27.4s, v3.16b, v1.4b[1]
  sdot v29.4s, v3.16b, v1.4b[2]
  sdot v31.4s, v3.16b, v1.4b[3]

  ld1 {v8.16b, v9.16b}, [x17], #32
  sdot v16.4s, v6.16b, v4.4b[0]
  sdot v18.4s, v6.16b, v4.4b[1]
  sdot v20.4s, v6.16b, v4.4b[2]
  sdot v22.4s, v6.16b, v4.4b[3]

  sdot v24.4s, v6.16b, v5.4b[0]
  sdot v26.4s, v6.16b, v5.4b[1]
  sdot v28.4s, v6.16b, v5.4b[2]
  sdot v30.4s, v6.16b, v5.4b[3]

  ld1 {v10.16b, v11.16b}, [x28], #32
  sdot v17.4s, v7.16b, v4.4b[0]
  sdot v19.4s, v7.16b, v4.4b[1]
  sdot v21.4s, v7.16b, v4.4b[2]
  sdot v23.4s, v7.16b, v4.4b[3]

  sdot v25.4s, v7.16b, v5.4b[0]
  sdot v27.4s, v7.16b, v5.4b[1]
  sdot v29.4s, v7.16b, v5.4b[2]
  sdot v31.4s, v7.16b, v5.4b[3]

  ld1 {v12.16b, v13.16b}, [x17], #32
  sdot v16.4s, v10.16b, v8.4b[0]
  sdot v18.4s, v10.16b, v8.4b[1]
  sdot v20.4s, v10.16b, v8.4b[2]
  sdot v22.4s, v10.16b, v8.4b[3]

  sdot v24.4s, v10.16b, v9.4b[0]
  sdot v26.4s, v10.16b, v9.4b[1]
  sdot v28.4s, v10.16b, v9.4b[2]
  sdot v30.4s, v10.16b, v9.4b[3]

  ld1 {v14.16b, v15.16b}, [x28], #32
  sdot v17.4s, v11.16b, v8.4b[0]
  sdot v19.4s, v11.16b, v8.4b[1]
  sdot v21.4s, v11.16b, v8.4b[2]
  sdot v23.4s, v11.16b, v8.4b[3]

  sdot v25.4s, v11.16b, v9.4b[0]
  sdot v27.4s, v11.16b, v9.4b[1]
  sdot v29.4s, v11.16b, v9.4b[2]
  sdot v31.4s, v11.16b, v9.4b[3]

  sdot v16.4s, v14.16b, v12.4b[0]
  sdot v18.4s, v14.16b, v12.4b[1]
  sdot v20.4s, v14.16b, v12.4b[2]
  sdot v22.4s, v14.16b, v12.4b[3]

  sdot v24.4s, v14.16b, v13.4b[0]
  sdot v26.4s, v14.16b, v13.4b[1]
  sdot v28.4s, v14.16b, v13.4b[2]
  sdot v30.4s, v14.16b, v13.4b[3]

  sdot v17.4s, v15.16b, v12.4b[0]
  sdot v19.4s, v15.16b, v12.4b[1]
  sdot v21.4s, v15.16b, v12.4b[2]
  sdot v23.4s, v15.16b, v12.4b[3]

  sdot v25.4s, v15.16b, v13.4b[0]
  sdot v27.4s, v15.16b, v13.4b[1]
  sdot v29.4s, v15.16b, v13.4b[2]
  sdot v31.4s, v15.16b, v13.4b[3]

  subs w20, w20, #16  // depth - 16
  b L3

  // Depth tail: one group of 4 per iteration until the depth counter hits 0.
LoopD4:
  cmp w20, #0
  beq End3

  ld1 {v0.16b, v1.16b}, [x17], #32
  ld1 {v2.16b, v3.16b}, [x28], #32

  sdot v16.4s, v2.16b, v0.4b[0]
  sdot v18.4s, v2.16b, v0.4b[1]
  sdot v20.4s, v2.16b, v0.4b[2]
  sdot v22.4s, v2.16b, v0.4b[3]
  sdot v24.4s, v2.16b, v1.4b[0]
  sdot v26.4s, v2.16b, v1.4b[1]
  sdot v28.4s, v2.16b, v1.4b[2]
  sdot v30.4s, v2.16b, v1.4b[3]
  sdot v17.4s, v3.16b, v0.4b[0]
  sdot v19.4s, v3.16b, v0.4b[1]
  sdot v21.4s, v3.16b, v0.4b[2]
  sdot v23.4s, v3.16b, v0.4b[3]
  sdot v25.4s, v3.16b, v1.4b[0]
  sdot v27.4s, v3.16b, v1.4b[1]
  sdot v29.4s, v3.16b, v1.4b[2]
  sdot v31.4s, v3.16b, v1.4b[3]

  subs w20, w20, #4  // depth - 4
  b LoopD4

End3:
  // Add (Bias+Depth*Za*Zb-Za*Bsums)
  // v15: bias for columns 0-3, v14: bias for columns 4-7 (same for every row)
  ld1 {v15.4s}, [x19], #16
  ld1 {v14.4s}, [x19], #16
  add v16.4s, v16.4s, v15.4s
  add v18.4s, v18.4s, v15.4s
  add v20.4s, v20.4s, v15.4s
  add v22.4s, v22.4s, v15.4s
  add v24.4s, v24.4s, v15.4s
  add v26.4s, v26.4s, v15.4s
  add v28.4s, v28.4s, v15.4s
  add v30.4s, v30.4s, v15.4s
  add v17.4s, v17.4s, v14.4s
  add v19.4s, v19.4s, v14.4s
  add v21.4s, v21.4s, v14.4s
  add v23.4s, v23.4s, v14.4s
  add v25.4s, v25.4s, v14.4s
  add v27.4s, v27.4s, v14.4s
  add v29.4s, v29.4s, v14.4s
  add v31.4s, v31.4s, v14.4s

  cmp w27, #0
  beq PerTSumLoad
PerCSumLoad:
  // Per-channel: a full 8x8 block of int32 sums is read straight from x6,
  // which keeps advancing across all tiles (it is never reset).
  ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x6], #64
  ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x6], #64
  ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x6], #64
  ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x6], #64
  b ApplySum
PerTSumLoad:
  // Per-tensor: one int32 sum per row (8 values from x22), broadcast across
  // the 8 columns of that row.
  ld1 {v14.4s}, [x22], #16
  ld1 {v15.4s}, [x22], #16
  dup v0.4s, v14.s[0]
  dup v1.4s, v14.s[0]
  dup v2.4s, v14.s[1]
  dup v3.4s, v14.s[1]
  dup v4.4s, v14.s[2]
  dup v5.4s, v14.s[2]
  dup v6.4s, v14.s[3]
  dup v7.4s, v14.s[3]
  dup v8.4s, v15.s[0]
  dup v9.4s, v15.s[0]
  dup v10.4s, v15.s[1]
  dup v11.4s, v15.s[1]
  dup v12.4s, v15.s[2]
  dup v13.4s, v15.s[2]
  dup v14.4s, v15.s[3]
  dup v15.4s, v14.s[0]    // v14 was just overwritten with row 7's sum, so this is v15.s[3] too
ApplySum:
  // Subtract (Asums*Zb)
  sub v16.4s, v16.4s, v0.4s
  sub v17.4s, v17.4s, v1.4s
  sub v18.4s, v18.4s, v2.4s
  sub v19.4s, v19.4s, v3.4s
  sub v20.4s, v20.4s, v4.4s
  sub v21.4s, v21.4s, v5.4s
  sub v22.4s, v22.4s, v6.4s
  sub v23.4s, v23.4s, v7.4s
  sub v24.4s, v24.4s, v8.4s
  sub v25.4s, v25.4s, v9.4s
  sub v26.4s, v26.4s, v10.4s
  sub v27.4s, v27.4s, v11.4s
  sub v28.4s, v28.4s, v12.4s
  sub v29.4s, v29.4s, v13.4s
  sub v30.4s, v30.4s, v14.4s
  sub v31.4s, v31.4s, v15.4s

  // Load the requantization parameters:
  //   v8/v9   = left_shift  for columns 0-3 / 4-7
  //   v10/v11 = multiplier  for columns 0-3 / 4-7
  //   v12/v13 = right_shift for columns 0-3 / 4-7
  cmp w27, #0
  beq PerTRoundLoad
PerCRoundLoad:
  // Per-channel: 8 values each; the pointers are advanced in End2.
  ld1 {v8.4s, v9.4s}, [x12]
  ld1 {v10.4s, v11.4s}, [x11]
  ld1 {v12.4s, v13.4s}, [x13]
  b ApplyRound
PerTRoundLoad:
  // Per-tensor: a single value of each, broadcast to all lanes.
  ld1 {v14.s}[0], [x12]
  dup v8.4s, v14.s[0]
  dup v9.4s, v14.s[0]
  ld1 {v14.s}[0], [x11]
  dup v10.4s, v14.s[0]
  dup v11.4s, v14.s[0]
  ld1 {v14.s}[0], [x13]
  dup v12.4s, v14.s[0]
  dup v13.4s, v14.s[0]
ApplyRound:
  // Apply left shift
  sqshl v16.4s, v16.4s, v8.4s
  sqshl v17.4s, v17.4s, v9.4s
  sqshl v18.4s, v18.4s, v8.4s
  sqshl v19.4s, v19.4s, v9.4s
  sqshl v20.4s, v20.4s, v8.4s
  sqshl v21.4s, v21.4s, v9.4s
  sqshl v22.4s, v22.4s, v8.4s
  sqshl v23.4s, v23.4s, v9.4s
  sqshl v24.4s, v24.4s, v8.4s
  sqshl v25.4s, v25.4s, v9.4s
  sqshl v26.4s, v26.4s, v8.4s
  sqshl v27.4s, v27.4s, v9.4s
  sqshl v28.4s, v28.4s, v8.4s
  sqshl v29.4s, v29.4s, v9.4s
  sqshl v30.4s, v30.4s, v8.4s
  sqshl v31.4s, v31.4s, v9.4s

  // Apply the fixed-point part of the multiplier.
  sqrdmulh v16.4s, v16.4s, v10.4s
  sqrdmulh v17.4s, v17.4s, v11.4s
  sqrdmulh v18.4s, v18.4s, v10.4s
  sqrdmulh v19.4s, v19.4s, v11.4s
  sqrdmulh v20.4s, v20.4s, v10.4s
  sqrdmulh v21.4s, v21.4s, v11.4s
  sqrdmulh v22.4s, v22.4s, v10.4s
  sqrdmulh v23.4s, v23.4s, v11.4s
  sqrdmulh v24.4s, v24.4s, v10.4s
  sqrdmulh v25.4s, v25.4s, v11.4s
  sqrdmulh v26.4s, v26.4s, v10.4s
  sqrdmulh v27.4s, v27.4s, v11.4s
  sqrdmulh v28.4s, v28.4s, v10.4s
  sqrdmulh v29.4s, v29.4s, v11.4s
  sqrdmulh v30.4s, v30.4s, v10.4s
  sqrdmulh v31.4s, v31.4s, v11.4s

  // Apply right shift
  // Per accumulator: (shift & acc) >> 31 yields -1 in lanes where both the
  // shift value and the accumulator have their sign bit set, 0 otherwise.
  // That correction is added (saturating) before the rounding shift so that
  // negative values are nudged ahead of SRSHL's rounding.
  // NOTE(review): this presumes right_shift holds non-positive values (SRSHL
  // shifts right for negative counts) - confirm against the caller.
  and v0.16b, v12.16b, v16.16b
  sshr v0.4s, v0.4s, #31
  sqadd v16.4s, v16.4s, v0.4s
  srshl v16.4s, v16.4s, v12.4s
  and v1.16b, v13.16b, v17.16b
  sshr v1.4s, v1.4s, #31
  sqadd v17.4s, v17.4s, v1.4s
  srshl v17.4s, v17.4s, v13.4s
  and v2.16b, v12.16b, v18.16b
  sshr v2.4s, v2.4s, #31
  sqadd v18.4s, v18.4s, v2.4s
  srshl v18.4s, v18.4s, v12.4s
  and v3.16b, v13.16b, v19.16b
  sshr v3.4s, v3.4s, #31
  sqadd v19.4s, v19.4s, v3.4s
  srshl v19.4s, v19.4s, v13.4s
  and v0.16b, v12.16b, v20.16b
  sshr v0.4s, v0.4s, #31
  sqadd v20.4s, v20.4s, v0.4s
  srshl v20.4s, v20.4s, v12.4s
  and v1.16b, v13.16b, v21.16b
  sshr v1.4s, v1.4s, #31
  sqadd v21.4s, v21.4s, v1.4s
  srshl v21.4s, v21.4s, v13.4s
  and v2.16b, v12.16b, v22.16b
  sshr v2.4s, v2.4s, #31
  sqadd v22.4s, v22.4s, v2.4s
  srshl v22.4s, v22.4s, v12.4s
  and v3.16b, v13.16b, v23.16b
  sshr v3.4s, v3.4s, #31
  sqadd v23.4s, v23.4s, v3.4s
  srshl v23.4s, v23.4s, v13.4s
  and v0.16b, v12.16b, v24.16b
  sshr v0.4s, v0.4s, #31
  sqadd v24.4s, v24.4s, v0.4s
  srshl v24.4s, v24.4s, v12.4s
  and v1.16b, v13.16b, v25.16b
  sshr v1.4s, v1.4s, #31
  sqadd v25.4s, v25.4s, v1.4s
  srshl v25.4s, v25.4s, v13.4s
  and v2.16b, v12.16b, v26.16b
  sshr v2.4s, v2.4s, #31
  sqadd v26.4s, v26.4s, v2.4s
  srshl v26.4s, v26.4s, v12.4s
  and v3.16b, v13.16b, v27.16b
  sshr v3.4s, v3.4s, #31
  sqadd v27.4s, v27.4s, v3.4s
  srshl v27.4s, v27.4s, v13.4s
  and v0.16b, v12.16b, v28.16b
  sshr v0.4s, v0.4s, #31
  sqadd v28.4s, v28.4s, v0.4s
  srshl v28.4s, v28.4s, v12.4s
  and v1.16b, v13.16b, v29.16b
  sshr v1.4s, v1.4s, #31
  sqadd v29.4s, v29.4s, v1.4s
  srshl v29.4s, v29.4s, v13.4s
  and v2.16b, v12.16b, v30.16b
  sshr v2.4s, v2.4s, #31
  sqadd v30.4s, v30.4s, v2.4s
  srshl v30.4s, v30.4s, v12.4s
  and v3.16b, v13.16b, v31.16b
  sshr v3.4s, v3.4s, #31
  sqadd v31.4s, v31.4s, v3.4s
  srshl v31.4s, v31.4s, v13.4s

  // Add the destination zero point
  dup v8.4s, w10
  add v16.4s, v16.4s, v8.4s
  add v17.4s, v17.4s, v8.4s
  add v18.4s, v18.4s, v8.4s
  add v19.4s, v19.4s, v8.4s
  add v20.4s, v20.4s, v8.4s
  add v21.4s, v21.4s, v8.4s
  add v22.4s, v22.4s, v8.4s
  add v23.4s, v23.4s, v8.4s
  add v24.4s, v24.4s, v8.4s
  add v25.4s, v25.4s, v8.4s
  add v26.4s, v26.4s, v8.4s
  add v27.4s, v27.4s, v8.4s
  add v28.4s, v28.4s, v8.4s
  add v29.4s, v29.4s, v8.4s
  add v30.4s, v30.4s, v8.4s
  add v31.4s, v31.4s, v8.4s

  // Apply the act_min bound
  dup v7.4s, w8
  smax v16.4s, v16.4s, v7.4s
  smax v17.4s, v17.4s, v7.4s
  smax v18.4s, v18.4s, v7.4s
  smax v19.4s, v19.4s, v7.4s
  smax v20.4s, v20.4s, v7.4s
  smax v21.4s, v21.4s, v7.4s
  smax v22.4s, v22.4s, v7.4s
  smax v23.4s, v23.4s, v7.4s
  smax v24.4s, v24.4s, v7.4s
  smax v25.4s, v25.4s, v7.4s
  smax v26.4s, v26.4s, v7.4s
  smax v27.4s, v27.4s, v7.4s
  smax v28.4s, v28.4s, v7.4s
  smax v29.4s, v29.4s, v7.4s
  smax v30.4s, v30.4s, v7.4s
  smax v31.4s, v31.4s, v7.4s

  // Apply the act_max bound
  dup v6.4s, w9
  smin v16.4s, v16.4s, v6.4s
  smin v17.4s, v17.4s, v6.4s
  smin v18.4s, v18.4s, v6.4s
  smin v19.4s, v19.4s, v6.4s
  smin v20.4s, v20.4s, v6.4s
  smin v21.4s, v21.4s, v6.4s
  smin v22.4s, v22.4s, v6.4s
  smin v23.4s, v23.4s, v6.4s
  smin v24.4s, v24.4s, v6.4s
  smin v25.4s, v25.4s, v6.4s
  smin v26.4s, v26.4s, v6.4s
  smin v27.4s, v27.4s, v6.4s
  smin v28.4s, v28.4s, v6.4s
  smin v29.4s, v29.4s, v6.4s
  smin v30.4s, v30.4s, v6.4s
  smin v31.4s, v31.4s, v6.4s

  // int32 -> int16 (saturating); v0..v7 each hold one full 8-column row
  sqxtn v0.4h, v16.4s
  sqxtn2 v0.8h, v17.4s
  sqxtn v1.4h, v18.4s
  sqxtn2 v1.8h, v19.4s
  sqxtn v2.4h, v20.4s
  sqxtn2 v2.8h, v21.4s
  sqxtn v3.4h, v22.4s
  sqxtn2 v3.8h, v23.4s
  sqxtn v4.4h, v24.4s
  sqxtn2 v4.8h, v25.4s
  sqxtn v5.4h, v26.4s
  sqxtn2 v5.8h, v27.4s
  sqxtn v6.4h, v28.4s
  sqxtn2 v6.8h, v29.4s
  sqxtn v7.4h, v30.4s
  sqxtn2 v7.8h, v31.4s

  // int16 -> int8 (saturating); two rows per register:
  // v8 = rows 0,1  v9 = rows 2,3  v10 = rows 4,5  v11 = rows 6,7
  sqxtn v8.8b, v0.8h
  sqxtn2 v8.16b, v1.8h
  sqxtn v9.8b, v2.8h
  sqxtn2 v9.16b, v3.8h
  sqxtn v10.8b, v4.8h
  sqxtn2 v10.16b, v5.8h
  sqxtn v11.8b, v6.8h
  sqxtn2 v11.16b, v7.8h

  cmp w23, #8
  blt Write     // if rows < 8
  cmp w15, #8
  blt Write     // if cols < 8

  // Full 8x8 tile: eight 8-byte row stores, stepping x2 by the output stride.
  st1 {v8.d}[0], [x2], x24
  st1 {v8.d}[1], [x2], x24
  st1 {v9.d}[0], [x2], x24
  st1 {v9.d}[1], [x2], x24
  st1 {v10.d}[0], [x2], x24
  st1 {v10.d}[1], [x2], x24
  st1 {v11.d}[0], [x2], x24
  st1 {v11.d}[1], [x2], x24
  b Endwrite

  // Partial tile: dispatch on the number of remaining columns (w15). Each
  // WriteColN path stores N bytes per row and stops after w23 rows when
  // w23 is in 1..7; otherwise all 8 rows are written.
Write:
  cmp w15, #8
  bge WriteCol8     // all 8 columns valid, so only the row count is short
  cmp w15, #7
  beq WriteCol7
  cmp w15, #6
  beq WriteCol6
  cmp w15, #5
  beq WriteCol5
  cmp w15, #4
  beq WriteCol4
  cmp w15, #3
  beq WriteCol3
  cmp w15, #2
  beq WriteCol2
  cmp w15, #1
  beq WriteCol1
  // any other value of w15 falls through into WriteCol8

WriteCol8:
  st1 {v8.d}[0], [x2], x24
  cmp w23, #1
  beq Endwrite
  st1 {v8.d}[1], [x2], x24
  cmp w23, #2
  beq Endwrite
  st1 {v9.d}[0], [x2], x24
  cmp w23, #3
  beq Endwrite
  st1 {v9.d}[1], [x2], x24
  cmp w23, #4
  beq Endwrite
  st1 {v10.d}[0], [x2], x24
  cmp w23, #5
  beq Endwrite
  st1 {v10.d}[1], [x2], x24
  cmp w23, #6
  beq Endwrite
  st1 {v11.d}[0], [x2], x24
  cmp w23, #7
  beq Endwrite
  st1 {v11.d}[1], [x2], x24
  b Endwrite

  // 7 bytes per row = 4 + 2 + 1, written through the temporary pointer x26
WriteCol7:
  mov x26, x2
  st1 {v8.s}[0], [x26], #4
  st1 {v8.h}[2], [x26], #2
  st1 {v8.b}[6], [x26], #1
  add x2, x2, x24
  cmp w23, #1
  beq Endwrite
  mov x26, x2
  st1 {v8.s}[2], [x26], #4
  st1 {v8.h}[6], [x26], #2
  st1 {v8.b}[14], [x26], #1
  add x2, x2, x24
  cmp w23, #2
  beq Endwrite
  mov x26, x2
  st1 {v9.s}[0], [x26], #4
  st1 {v9.h}[2], [x26], #2
  st1 {v9.b}[6], [x26], #1
  add x2, x2, x24
  cmp w23, #3
  beq Endwrite
  mov x26, x2
  st1 {v9.s}[2], [x26], #4
  st1 {v9.h}[6], [x26], #2
  st1 {v9.b}[14], [x26], #1
  add x2, x2, x24
  cmp w23, #4
  beq Endwrite
  mov x26, x2
  st1 {v10.s}[0], [x26], #4
  st1 {v10.h}[2], [x26], #2
  st1 {v10.b}[6], [x26], #1
  add x2, x2, x24
  cmp w23, #5
  beq Endwrite
  mov x26, x2
  st1 {v10.s}[2], [x26], #4
  st1 {v10.h}[6], [x26], #2
  st1 {v10.b}[14], [x26], #1
  add x2, x2, x24
  cmp w23, #6
  beq Endwrite
  mov x26, x2
  st1 {v11.s}[0], [x26], #4
  st1 {v11.h}[2], [x26], #2
  st1 {v11.b}[6], [x26], #1
  add x2, x2, x24
  cmp w23, #7
  beq Endwrite
  mov x26, x2
  st1 {v11.s}[2], [x26], #4
  st1 {v11.h}[6], [x26], #2
  st1 {v11.b}[14], [x26], #1
  add x2, x2, x24
  b Endwrite

  // 6 bytes per row = 4 + 2
WriteCol6:
  mov x26, x2
  st1 {v8.s}[0], [x26], #4
  st1 {v8.h}[2], [x26], #2
  add x2, x2, x24
  cmp w23, #1
  beq Endwrite
  mov x26, x2
  st1 {v8.s}[2], [x26], #4
  st1 {v8.h}[6], [x26], #2
  add x2, x2, x24
  cmp w23, #2
  beq Endwrite
  mov x26, x2
  st1 {v9.s}[0], [x26], #4
  st1 {v9.h}[2], [x26], #2
  add x2, x2, x24
  cmp w23, #3
  beq Endwrite
  mov x26, x2
  st1 {v9.s}[2], [x26], #4
  st1 {v9.h}[6], [x26], #2
  add x2, x2, x24
  cmp w23, #4
  beq Endwrite
  mov x26, x2
  st1 {v10.s}[0], [x26], #4
  st1 {v10.h}[2], [x26], #2
  add x2, x2, x24
  cmp w23, #5
  beq Endwrite
  mov x26, x2
  st1 {v10.s}[2], [x26], #4
  st1 {v10.h}[6], [x26], #2
  add x2, x2, x24
  cmp w23, #6
  beq Endwrite
  mov x26, x2
  st1 {v11.s}[0], [x26], #4
  st1 {v11.h}[2], [x26], #2
  add x2, x2, x24
  cmp w23, #7
  beq Endwrite
  mov x26, x2
  st1 {v11.s}[2], [x26], #4
  st1 {v11.h}[6], [x26], #2
  add x2, x2, x24
  b Endwrite

  // 5 bytes per row = 4 + 1
WriteCol5:
  mov x26, x2
  st1 {v8.s}[0], [x26], #4
  st1 {v8.b}[4], [x26], #1
  add x2, x2, x24
  cmp w23, #1
  beq Endwrite
  mov x26, x2
  st1 {v8.s}[2], [x26], #4
  st1 {v8.b}[12], [x26], #1
  add x2, x2, x24
  cmp w23, #2
  beq Endwrite
  mov x26, x2
  st1 {v9.s}[0], [x26], #4
  st1 {v9.b}[4], [x26], #1
  add x2, x2, x24
  cmp w23, #3
  beq Endwrite
  mov x26, x2
  st1 {v9.s}[2], [x26], #4
  st1 {v9.b}[12], [x26], #1
  add x2, x2, x24
  cmp w23, #4
  beq Endwrite
  mov x26, x2
  st1 {v10.s}[0], [x26], #4
  st1 {v10.b}[4], [x26], #1
  add x2, x2, x24
  cmp w23, #5
  beq Endwrite
  mov x26, x2
  st1 {v10.s}[2], [x26], #4
  st1 {v10.b}[12], [x26], #1
  add x2, x2, x24
  cmp w23, #6
  beq Endwrite
  mov x26, x2
  st1 {v11.s}[0], [x26], #4
  st1 {v11.b}[4], [x26], #1
  add x2, x2, x24
  cmp w23, #7
  beq Endwrite
  mov x26, x2
  st1 {v11.s}[2], [x26], #4
  st1 {v11.b}[12], [x26], #1
  add x2, x2, x24
  b Endwrite

  // 4 bytes per row: a single 32-bit lane store
WriteCol4:
  st1 {v8.s}[0], [x2], x24
  cmp w23, #1
  beq Endwrite
  st1 {v8.s}[2], [x2], x24
  cmp w23, #2
  beq Endwrite
  st1 {v9.s}[0], [x2], x24
  cmp w23, #3
  beq Endwrite
  st1 {v9.s}[2], [x2], x24
  cmp w23, #4
  beq Endwrite
  st1 {v10.s}[0], [x2], x24
  cmp w23, #5
  beq Endwrite
  st1 {v10.s}[2], [x2], x24
  cmp w23, #6
  beq Endwrite
  st1 {v11.s}[0], [x2], x24
  cmp w23, #7
  beq Endwrite
  st1 {v11.s}[2], [x2], x24
  b Endwrite

  // 3 bytes per row = 2 + 1
WriteCol3:
  mov x26, x2
  st1 {v8.h}[0], [x26], #2
  st1 {v8.b}[2], [x26], #1
  add x2, x2, x24
  cmp w23, #1
  beq Endwrite
  mov x26, x2
  st1 {v8.h}[4], [x26], #2
  st1 {v8.b}[10], [x26], #1
  add x2, x2, x24
  cmp w23, #2
  beq Endwrite
  mov x26, x2
  st1 {v9.h}[0], [x26], #2
  st1 {v9.b}[2], [x26], #1
  add x2, x2, x24
  cmp w23, #3
  beq Endwrite
  mov x26, x2
  st1 {v9.h}[4], [x26], #2
  st1 {v9.b}[10], [x26], #1
  add x2, x2, x24
  cmp w23, #4
  beq Endwrite
  mov x26, x2
  st1 {v10.h}[0], [x26], #2
  st1 {v10.b}[2], [x26], #1
  add x2, x2, x24
  cmp w23, #5
  beq Endwrite
  mov x26, x2
  st1 {v10.h}[4], [x26], #2
  st1 {v10.b}[10], [x26], #1
  add x2, x2, x24
  cmp w23, #6
  beq Endwrite
  mov x26, x2
  st1 {v11.h}[0], [x26], #2
  st1 {v11.b}[2], [x26], #1
  add x2, x2, x24
  cmp w23, #7
  beq Endwrite
  mov x26, x2
  st1 {v11.h}[4], [x26], #2
  st1 {v11.b}[10], [x26], #1
  add x2, x2, x24
  b Endwrite

  // 2 bytes per row: a single 16-bit lane store
WriteCol2:
  st1 {v8.h}[0], [x2], x24
  cmp w23, #1
  beq Endwrite
  st1 {v8.h}[4], [x2], x24
  cmp w23, #2
  beq Endwrite
  st1 {v9.h}[0], [x2], x24
  cmp w23, #3
  beq Endwrite
  st1 {v9.h}[4], [x2], x24
  cmp w23, #4
  beq Endwrite
  st1 {v10.h}[0], [x2], x24
  cmp w23, #5
  beq Endwrite
  st1 {v10.h}[4], [x2], x24
  cmp w23, #6
  beq Endwrite
  st1 {v11.h}[0], [x2], x24
  cmp w23, #7
  beq Endwrite
  st1 {v11.h}[4], [x2], x24
  b Endwrite

  // 1 byte per row: a single 8-bit lane store
WriteCol1:
  st1 {v8.b}[0], [x2], x24
  cmp w23, #1
  beq Endwrite
  st1 {v8.b}[8], [x2], x24
  cmp w23, #2
  beq Endwrite
  st1 {v9.b}[0], [x2], x24
  cmp w23, #3
  beq Endwrite
  st1 {v9.b}[8], [x2], x24
  cmp w23, #4
  beq Endwrite
  st1 {v10.b}[0], [x2], x24
  cmp w23, #5
  beq Endwrite
  st1 {v10.b}[8], [x2], x24
  cmp w23, #6
  beq Endwrite
  st1 {v11.b}[0], [x2], x24
  cmp w23, #7
  beq Endwrite
  st1 {v11.b}[8], [x2], x24
  b Endwrite

Endwrite:
  sub w16, w16, #8      // a row8 counter - 8
  sub w23, w23, #8      // a row counter - 8
  b L2

End2:
  sub w4, w4, #8        // b col8 counter - 8
  sub w15, w15, #8      // b col counter - 8
  add x1, x1, x21       // b ptr + stride
  add x7, x7, #32       // bias ptr + stride
  add x25, x25, #8      // output + stride(8 * sizeof(int8))
  mov x2, x25

  // Per-channel only: step the quantization parameter pointers by 8 int32s.
  cmp w27, #0
  beq PerTEnd2
  add x12, x12, #32
  add x11, x11, #32
  add x13, x13, #32
PerTEnd2:
  b L1

End1:
  // sp still points at the bottom of the save area; the post-incrementing
  // loads restore the callee-saved registers and release the 208-byte frame,
  // leaving sp at its entry value.
  ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
  ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
  ldp x19, x20, [sp], #16
  ldp x21, x22, [sp], #16
  ldp x23, x24, [sp], #16
  ldp x25, x26, [sp], #16
  ldp x27, x28, [sp], #16
  ret
#endif
